# This is the input block -- a full-blown Python shell
print('Look: I will be shown on output block')
import pprint
from IPython.core.display import HTML
# HTML(...) renders raw HTML (here an <img>) directly in the notebook output cell.
HTML('Logo of Initium Lab: <img src="%s">' % 'http://initiumlab.com/favicon-32x32.png')
# Display any HTML easily
my_html = '''
I'm going to show you:
<ul>
<li> PyReadability </li>
<li> PyQuery </li>
<li> ... </li>
</ul>
'''
HTML(my_html)
%%javascript
// Disable the notebook's output auto-scrolling so long outputs stay fully visible.
//IPython.OutputArea.auto_scroll_threshold = 9999;
IPython.OutputArea.prototype._should_scroll = function(){return false;}
# I'm going to insert some slides here
from IPython.core.display import Image
Hong Kong Legislative Council:
https://theinitium.com/article/20150812-hongkong-legcoanalysis/
http://legco.initiumlab.com/matrix

Hong Kong District Council Election:
https://theinitium.com/project/20151012-hk-district-council-elections/

https://theinitium.com/project/20151019-hk-district-council-elections-2/

https://theinitium.com/project/20151029-hk-district-council-elections-3/

# Slide figures displayed inline in the notebook.
Image('assets/venn-skillset.png')
Image('assets/workflow-highlight-data-collection.png')
print('screenshot from: https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp')
Image('assets/rgc-official-site.png')
print('e.g. # of Hong Kong v.s. Non Hong Kong studies (Social Science)')
print('(Just draft labeling! -- cite the figure at your own risk)')
Image('assets/hk-non-hk-studies-humanities.png')
%%sh
# List the current directory, one entry per line.
ls -1
%%sh
# Fetch the Initium Lab homepage and show the first 8 lines of raw HTML.
curl -s 'http://initiumlab.com/' | head -n 8
import requests
# Same fetch from Python: .content is the raw response body (bytes).
html = requests.get('http://initiumlab.com/').content
html[:500]  # peek at the first 500 bytes
Not an easy task, generally:
%%sh
# Naive grep: matches ANY line containing the word "title".
curl -s 'http://initiumlab.com/' | grep title
%%sh
# Tighter match: only the opening <title tag.
curl -s 'http://initiumlab.com/' | grep '<title'
The items marked [Y] will be covered in this talk.
For a mature project, you usually loop between the Download and Parse stages; e.g. scrapy is a widely used framework.
Keywords of this demo:
Human-friendly command tool written in Python
%%sh
# HTTPie: a human-friendly HTTP client; show lines 41-50 of the response.
http get http://initiumlab.com | head -n 50 | tail -n 10
No. Nearly seamless integration:
# IPython's "!" runs a shell command and captures stdout as a Python list of lines.
lines = !http get http://initiumlab.com
lines[0:10]
':' HTTP headers:
Referer:http://httpie.org Cookie:foo=bar User-Agent:bacon/1.0
'==' URL parameters to be appended to the request URI:
search==httpie
'=' Data fields to be serialized into a JSON object (with --json, -j)
or form data (with --form, -f):
name=HTTPie language=Python description='CLI HTTP client'
':=' Non-string JSON data fields (only with --json, -j):
awesome:=true amount:=42 colors:='["red", "green", "blue"]'
'@' Form file fields (only with --form, -f):
cs@~/Documents/CV.pdf
'=@' A data field like '=', but takes a file path and embeds its content:
essay=@Documents/essay.txt
':=@' A raw JSON field like ':=', but takes a file path and embeds its content:
package:=@./package.json
You can use a backslash to escape a colliding separator in the field name:
field-name-with\:colon=value
%%sh
# "==" appends URL query parameters; httpbin.org echoes the request back.
http get 'http://httpbin.org/get' name==hupili at=='Scrape more with less codes!'
%%sh
# Same query parameters, but with a POST request.
http post 'http://httpbin.org/post' name==hupili at=='Scrape more with less codes!'
Caveats: --ignore-stdin is required in IPython notebook
Not a problem in command-line env.
Related issues: https://github.com/jkbrzt/httpie/issues/150
%%sh
# --ignore-stdin is required inside the notebook; "==" stays a query param, "=" is a form field.
http --form --ignore-stdin post 'http://httpbin.org/post' name==hupili at='Scrape more with less codes!'
Note: "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
%%sh
# With --form, "=" fields are sent urlencoded in the request body.
http --form --ignore-stdin post 'http://httpbin.org/post' name=hupili at='Scrape more with less codes!'
Note: "Content-Type": "application/json",
%%sh
# Without --form, "=" fields are serialized as a JSON body instead.
http --ignore-stdin post 'http://httpbin.org/post' name=hupili at='Scrape more with less codes!'
%%sh
# A plain GET on the RGC search page...
http get https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp | head -n 10
%%sh
# ...and grepping it for '155' (seen in the project links) returns nothing.
http get https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp | grep 155
Needs to find out why it doesn't give us the links
Image('assets/rgc-search-network-trace.png')
Now use HTTPie to easily construct the query
%%sh
# A bare POST still doesn't return the project links...
http post https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp | grep 155
%%sh
# ...but POSTing the search form fields (mode=search, sScheme=1) does.
http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1' | grep 155
html_lines = !http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1'
html_lines_with_a = list(filter(lambda l: '<A' in l, html_lines))
html_lines_with_a[:5]
A wrapper around PyQuery -- a Python library that allows you to manipulate HTML in jQuery style.
%%sh
# Same form POST, line-continued for readability.
http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1' \
| grep 155
%%sh
# Case-sensitive grep for '<a' misses the page's upper-case <A> tags.
http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1' \
| grep '<a'
Ignore case
%%sh
# grep -i: case-insensitive match for the anchor tags.
http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1' \
| grep -i '<a' | head -n 5
%%sh
# -o prints only the matching HREF="..." portion of each line.
http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1' \
| grep -i '<a' | grep -o 'HREF=".*"' | head -n 5
%%sh
# cut on the '"' delimiter extracts the bare URL path.
http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1' \
| grep -i '<a' | grep -o 'HREF=".*"' | cut -d'"' -f2 | head -n 5
%%sh
# pquery: select 'a' elements with a CSS selector instead of grepping.
http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1' \
| pquery 'a' | head -n 5
%%sh
# -p href prints just the href attribute of each match.
http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1' \
| pquery 'a' -p href | head -n 5
%%sh
# Count all links on the page.
http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1' \
| pquery 'a' -p href | wc -l
Image('assets/rgc-index-list.png')
%%sh
# Narrow the selector to links inside right-aligned table cells (the project links).
http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1' \
| pquery "table td[align='right'] a" -p href | wc -l
Scrape the info of 60 data science books and visualise their connection: http://www.kdnuggets.com/2015/09/free-data-science-books.html
%%sh
# Scrape book titles and links; -f formats each match as CSV: "title",href.
http --body 'http://www.kdnuggets.com/2015/09/free-data-science-books.html' |\
pquery '.three_ul li strong a' -f '"{text}",{href}' |\
head -n 8
Image('assets/data-science-books-graph.png')
%%sh
# Save the project link paths to a file for later batch download.
http --ignore-stdin --form post 'https://cerg1.ugc.edu.hk/cergprod/scrrm00541.jsp' 'mode=search' 'sScheme=1' \
| pquery "table td[align='right'] a" -p href > path-list.txt
Next, let's download them all
%%sh
# xargs -I{} substitutes each path into the URL; fetch one project page's table.
tail -n 1 path-list.txt | xargs -I{} http "https://cerg1.ugc.edu.hk/cergprod/{}" \
| pquery 'table.styleTableContent' -p html | head -n 10
%%sh
# Extract the text of every table cell from that page.
tail -n 1 path-list.txt | xargs -I{} http "https://cerg1.ugc.edu.hk/cergprod/{}" \
| pquery 'table.styleTableContent' -p html | pquery 'td' -p text
# Sequential download of 10 pages, timed with %time...
%time page_lines = !tail -n 10 path-list.txt | xargs -I{} http "https://cerg1.ugc.edu.hk/cergprod/{}"
# ...vs. the same downloads with 5 parallel workers (xargs -P5).
%time page_lines = !tail -n 10 path-list.txt | xargs -I{} -P5 http "https://cerg1.ugc.edu.hk/cergprod/{}"
A) My early dirty work: https://github.com/hupili/Lightweight-Distributing-Toolset
In Perl. 4 years ago. Do not use
B) GNU Parallel: http://www.gnu.org/software/parallel/
Written in Perl. Only need SSH access to remote (or local machine)
Cool, but...
C) PSSH: https://code.google.com/p/parallel-ssh/
%%file hosts
localhost
%%sh
# Show the PSSH host list we just wrote (the %%file cell above writes it verbatim).
cat hosts
%%sh
# Run a command on every host via parallel-ssh; per-host stdout goes under output/.
pssh -h hosts -o output/ 'echo hello PSSH'
%%sh
ls output/
%%sh
# Each host's captured stdout lands in a file named after the host.
cat output/localhost
Easier for tabular data.
%%sh
# Grab one project page's detail table as raw HTML.
tail -n 1 path-list.txt | xargs -I{} http "https://cerg1.ugc.edu.hk/cergprod/{}" \
| pquery 'table.styleTableContent' -p html | head -n 5
# Capture the table HTML into Python, then let pandas parse it.
table_html = !tail -n 1 path-list.txt | xargs -I{} http "https://cerg1.ugc.edu.hk/cergprod/{}" | pquery 'table.styleTableContent' -p html
import pandas as pd
# read_html needs an enclosing <table>; it returns a list of DataFrames.
df_projects = pd.read_html('<table>%s</table>' % '\n'.join(table_html))
df_projects[0]
We use a version ported to Python3:
https://github.com/hyperlinkapp/python-readability
(already included in the requirements.txt file)
from readability.readability import Document
import requests
html = requests.get('http://initiumlab.com/blog/20150922-jackathon3-review/').content
# Readability strips boilerplate: summary() keeps the main article body,
# short_title() the cleaned-up title.
readable_article = Document(html).summary()
readable_title = Document(html).short_title()
print(readable_article[:1000])
HTML(readable_article[:1000])
Let's fix the above URL problems
import pyquery
# Wrap the extracted article so we can query/edit it jQuery-style.
r = pyquery.PyQuery(readable_article)
r('p')
r('video').attr('poster')
r('video source').attr('src')
# attr(name, value) sets the attribute; prefix the site-relative path to make it absolute.
r('video').attr('poster', 'http://initiumlab.com/%s' % r('video').attr('poster'))
r('video').attr('poster')
r('video source').attr('src', 'http://initiumlab.com/%s' % r('video source').attr('src'))
r('video source').attr('src')
r.html()[:1000]
%%javascript
// Disable output auto-scrolling again (re-run after the big outputs above).
//IPython.OutputArea.auto_scroll_threshold = 9999;
IPython.OutputArea.prototype._should_scroll = function(){return false;}
HTML(r.html()[:1000])
# Scrapely learns an extraction pattern from one hand-labelled example page.
from scrapely import Scraper
s = Scraper()
help(s.train)
from urllib import parse
def get_localhost_url(url):
    """Cache *url* under tmp/ and return a localhost URL serving the cached copy.

    The remote page is fetched once and written to tmp/<quoted-url>, then
    exposed through the notebook server's /files/ endpoint so scrapely can
    train/scrape against a stable local copy.

    Parameters: url -- the remote page to download.
    Returns: a http://localhost:8888/files/... URL for the cached file.
    """
    import os
    filename = parse.quote_plus(url)
    fullpath = 'tmp/%s' % filename
    os.makedirs('tmp', exist_ok=True)  # the cache dir may not exist yet
    html = requests.get(url).content
    # Context manager closes the file handle deterministically
    # (the original open(...).write(...) leaked it).
    with open(fullpath, 'wb') as f:
        f.write(html)
    return 'http://localhost:8888/files/%s?download=1' % parse.quote_plus(fullpath)
# Train on one post with hand-labelled field values...
training_url = 'http://initiumlab.com/blog/20150916-legco-eng/'
training_data = {'title': 'Legco Matrix Brief (English)',
'author': 'Initium Lab',
'date': '2015-09-16'}
s.train(get_localhost_url(training_url), training_data)
# ...then extract the same fields from other, unlabelled posts.
testing_url = 'http://initiumlab.com/blog/20150901-data-journalism-for-the-blind/'
s.scrape(get_localhost_url(testing_url))
testing_url = 'http://initiumlab.com/blog/20150922-jackathon3-review/'
s.scrape(get_localhost_url(testing_url))
testing_url = 'http://initiumlab.com/blog/20151015-3d-infographic-user-testing/'
s.scrape(get_localhost_url(testing_url))
blogs = !http get http://initiumlab.com/blog/ | pquery 'a.post-title-link' -p href
blogs
infos = []
for b in blogs:
infos.extend(s.scrape(get_localhost_url('http://initiumlab.com/blog/' + b)))
infos
import pandas as pd
df_blogs = pd.DataFrame(infos)
df_blogs['title'] = df_blogs['title'].apply(lambda x: x[0].strip())
df_blogs
Theme: scrape more with less codes
Keywords: quick and dirty hacks
Environment: IPython notebook
Human-friendly HTTP interface: HTTPie, requests
Scale-out: xargs -P, pssh
Manual parse: pQuery (PyQuery for front-end people), pandas (useful for tabular data)
Automatic parse, in the Python REPL: PyReadability (extracts the main body of a page), scrapely (learns patterns from your labelling)